2024-09-22
source: Lung Cancer Dataset
# Load necessary libraries
library(readr) # import data
library(ggplot2) # data visualization
library(plotly) # interactive data visualization
library(tidyr) # data manipulation
library(dplyr) # data manipulation## Rows: 309 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GENDER, LUNG_CANCER
## dbl (14): AGE, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE, CHRONIC DISE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 309 × 16
## GENDER AGE SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE `CHRONIC DISEASE`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M 69 1 2 2 1 1
## 2 M 74 2 1 1 1 2
## 3 F 59 1 1 1 2 1
## 4 M 63 2 2 2 1 1
## 5 F 63 1 2 1 1 1
## 6 F 75 1 2 1 1 2
## 7 M 52 2 1 1 1 1
## 8 F 51 2 2 2 2 1
## 9 F 68 2 1 2 1 1
## 10 M 53 2 2 2 2 2
## # ℹ 299 more rows
## # ℹ 9 more variables: FATIGUE <dbl>, ALLERGY <dbl>, WHEEZING <dbl>,
## # `ALCOHOL CONSUMING` <dbl>, COUGHING <dbl>, `SHORTNESS OF BREATH` <dbl>,
## # `SWALLOWING DIFFICULTY` <dbl>, `CHEST PAIN` <dbl>, LUNG_CANCER <chr>
## spc_tbl_ [309 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ GENDER : chr [1:309] "M" "M" "F" "M" ...
## $ AGE : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
## $ SMOKING : num [1:309] 1 2 1 2 1 1 2 2 2 2 ...
## $ YELLOW_FINGERS : num [1:309] 2 1 1 2 2 2 1 2 1 2 ...
## $ ANXIETY : num [1:309] 2 1 1 2 1 1 1 2 2 2 ...
## $ PEER_PRESSURE : num [1:309] 1 1 2 1 1 1 1 2 1 2 ...
## $ CHRONIC DISEASE : num [1:309] 1 2 1 1 1 2 1 1 1 2 ...
## $ FATIGUE : num [1:309] 2 2 2 1 1 2 2 2 2 1 ...
## $ ALLERGY : num [1:309] 1 2 1 1 1 2 1 2 1 2 ...
## $ WHEEZING : num [1:309] 2 1 2 1 2 2 2 1 1 1 ...
## $ ALCOHOL CONSUMING : num [1:309] 2 1 1 2 1 1 2 1 1 2 ...
## $ COUGHING : num [1:309] 2 1 2 1 2 2 2 1 1 1 ...
## $ SHORTNESS OF BREATH : num [1:309] 2 2 2 1 2 2 2 2 1 1 ...
## $ SWALLOWING DIFFICULTY: num [1:309] 2 2 1 2 1 1 1 2 1 2 ...
## $ CHEST PAIN : num [1:309] 2 2 2 2 1 1 2 1 1 2 ...
## $ LUNG_CANCER : chr [1:309] "YES" "YES" "NO" "NO" ...
## - attr(*, "spec")=
## .. cols(
## .. GENDER = col_character(),
## .. AGE = col_double(),
## .. SMOKING = col_double(),
## .. YELLOW_FINGERS = col_double(),
## .. ANXIETY = col_double(),
## .. PEER_PRESSURE = col_double(),
## .. `CHRONIC DISEASE` = col_double(),
## .. FATIGUE = col_double(),
## .. ALLERGY = col_double(),
## .. WHEEZING = col_double(),
## .. `ALCOHOL CONSUMING` = col_double(),
## .. COUGHING = col_double(),
## .. `SHORTNESS OF BREATH` = col_double(),
## .. `SWALLOWING DIFFICULTY` = col_double(),
## .. `CHEST PAIN` = col_double(),
## .. LUNG_CANCER = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
## GENDER AGE SMOKING
## 0 0 0
## YELLOW_FINGERS ANXIETY PEER_PRESSURE
## 0 0 0
## CHRONIC DISEASE FATIGUE ALLERGY
## 0 0 0
## WHEEZING ALCOHOL CONSUMING COUGHING
## 0 0 0
## SHORTNESS OF BREATH SWALLOWING DIFFICULTY CHEST PAIN
## 0 0 0
## LUNG_CANCER
## 0
There are no missing values in the dataset.
# Prepare the analysis table in one pipeline:
#   1. drop the symptom columns, which are not relevant to the study;
#   2. give the two space-containing column names syntactic names;
#   3. store GENDER and LUNG_CANCER as factors and add numeric copies of them
#      (as.numeric() on a factor returns the level index, i.e. 1 or 2).
data <- data %>%
  select(
    -all_of(c(
      "YELLOW_FINGERS", "FATIGUE", "ALLERGY", "WHEEZING", "COUGHING",
      "SHORTNESS OF BREATH", "SWALLOWING DIFFICULTY", "CHEST PAIN"
    ))
  ) %>%
  rename(
    ALCOHOL_CONSUMING = `ALCOHOL CONSUMING`,
    CHRONIC_DISEASE = `CHRONIC DISEASE`
  ) %>%
  mutate(
    GENDER = as.factor(GENDER),
    GENDER_num = as.numeric(GENDER),
    LUNG_CANCER = as.factor(LUNG_CANCER),
    LUNG_CANCER_num = as.numeric(LUNG_CANCER)
  )
# Inspect the resulting structure.
str(data)
## tibble [309 × 10] (S3: tbl_df/tbl/data.frame)
## $ GENDER : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
## $ AGE : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
## $ SMOKING : num [1:309] 1 2 1 2 1 1 2 2 2 2 ...
## $ ANXIETY : num [1:309] 2 1 1 2 1 1 1 2 2 2 ...
## $ PEER_PRESSURE : num [1:309] 1 1 2 1 1 1 1 2 1 2 ...
## $ CHRONIC_DISEASE : num [1:309] 1 2 1 1 1 2 1 1 1 2 ...
## $ ALCOHOL_CONSUMING: num [1:309] 2 1 1 2 1 1 2 1 1 2 ...
## $ LUNG_CANCER : Factor w/ 2 levels "NO","YES": 2 2 1 1 1 2 2 2 1 2 ...
## $ GENDER_num : num [1:309] 2 2 1 2 1 1 2 1 1 2 ...
## $ LUNG_CANCER_num : num [1:309] 2 2 1 1 1 2 2 2 1 2 ...
# Interactive heatmap of the correlation matrix `corr_data`:
# columns go on the x axis, rows on the y axis, cell values set the colour.
plot_ly(
  z = corr_data,
  x = colnames(corr_data),
  y = rownames(corr_data),
  type = "heatmap"
)
# Problem: significantly correlated pairs cannot be identified directly by
# looking at the heatmap.
# Print the correlation between the outcome column and every other variable.
# The outcome column of corr_data is read directly instead of scanning every
# (i, j) pair, and cells are addressed by name as [row, column], so the right
# cell is read even if corr_data is not symmetric.
target <- "LUNG_CANCER_num"
# Nothing is printed when the outcome column is absent.
if (target %in% colnames(corr_data)) {
  # Every row except the outcome itself, in matrix order.
  for (other in setdiff(rownames(corr_data), target)) {
    print(paste(
      "Correlation between", target, "and", other,
      "is", corr_data[other, target]
    ))
  }
}
## [1] "Correlation between LUNG_CANCER_num and AGE is 0.0894645760662337"
## [1] "Correlation between LUNG_CANCER_num and SMOKING is 0.0581788858520387"
## [1] "Correlation between LUNG_CANCER_num and ANXIETY is 0.144947132887312"
## [1] "Correlation between LUNG_CANCER_num and PEER_PRESSURE is 0.186387631715407"
## [1] "Correlation between LUNG_CANCER_num and CHRONIC_DISEASE is 0.110891094642414"
## [1] "Correlation between LUNG_CANCER_num and ALCOHOL_CONSUMING is 0.288532803091731"
## [1] "Correlation between LUNG_CANCER_num and GENDER_num is 0.0672541746783065"
# Draw a 100% stacked bar chart of LUNG_CANCER status for each value of one
# predictor, labelling every segment with its percentage share of the bar.
#
# data:  data frame containing LUNG_CANCER and the column named by `x_var`.
# x_var: name (character string) of the column shown on the x axis.
# Returns the ggplot object; it is drawn when printed at top level.
plot_lung_cancer_share <- function(data, x_var) {
  ggplot(data, aes(x = .data[[x_var]], fill = LUNG_CANCER)) +
    geom_bar(position = "fill") +
    geom_text(
      stat = "count",
      aes(
        # Segment count divided by the total count at the same x position
        # gives the share of the bar that the segment occupies.
        label = after_stat(
          sprintf("%.1f%%", 100 * count / ave(count, x, FUN = sum))
        ),
        group = LUNG_CANCER
      ),
      # Centre each label vertically inside its segment.
      position = position_fill(vjust = 0.5)
    ) +
    # position = "fill" rescales bars to proportions, so name the axes to match.
    labs(x = x_var, y = "proportion")
}

plot_lung_cancer_share(data, "ALCOHOL_CONSUMING")
plot_lung_cancer_share(data, "PEER_PRESSURE")
plot_lung_cancer_share(data, "ANXIETY")
สรุปจาก data visualization พบว่ามีความสัมพันธ์ทางสถิติระหว่างตัวแปรค่อนข้างต่ำระหว่าง ALCOHOL CONSUMING, PEER PRESSURE และ ANXIETY ในผู้เป็นและไม่เป็นโรค LUNG CANCER จากการดู Visualization
# Logistic regression of lung-cancer status on the demographic and lifestyle
# predictors.
# The outcome must be a factor for the binomial family; as.factor() leaves an
# existing factor unchanged. With levels "NO" < "YES", the model estimates the
# log-odds of "YES".
data[["LUNG_CANCER"]] <- as.factor(data[["LUNG_CANCER"]])
model <- glm(
  LUNG_CANCER ~ GENDER + AGE + SMOKING + ANXIETY + PEER_PRESSURE +
    CHRONIC_DISEASE + ALCOHOL_CONSUMING,
  family = "binomial",
  data = data
)
# Coefficient table, deviances and AIC.
summary(model)
##
## Call:
## glm(formula = LUNG_CANCER ~ GENDER + AGE + SMOKING + ANXIETY +
## PEER_PRESSURE + CHRONIC_DISEASE + ALCOHOL_CONSUMING, family = "binomial",
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.96635 2.23833 -4.453 8.48e-06 ***
## GENDERM -0.22015 0.46688 -0.472 0.637261
## AGE 0.01971 0.02286 0.862 0.388598
## SMOKING 0.37898 0.40907 0.926 0.354213
## ANXIETY 1.30636 0.46243 2.825 0.004728 **
## PEER_PRESSURE 1.67526 0.46296 3.619 0.000296 ***
## CHRONIC_DISEASE 1.43470 0.47169 3.042 0.002353 **
## ALCOHOL_CONSUMING 2.91488 0.58416 4.990 6.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.30 on 308 degrees of freedom
## Residual deviance: 169.29 on 301 degrees of freedom
## AIC: 185.29
##
## Number of Fisher Scoring iterations: 6
สรุปจาก statistical analysis ปัจจัยที่ส่งผลให้เกิด Lung Cancer อย่างมาก (p-value < 0.001) ประกอบด้วย Peer pressure และ Alcohol consuming ปัจจัยที่ส่งผลให้เกิด Lung Cancer มาก (p-value < 0.01) ประกอบด้วย Anxiety และ Chronic disease